The subsequent investigation is related to the hypothesis of the Environmental Kuznets Curve (EKC) which states that emissions decline with increasing GDP as soon as a certain economic wealth is reached. Hence, it assumes a quadratic relationship between GDP and emission.
If this hypothesis holds, the established saying of “Green growth” might be feasible. However, the following analysis of World Bank data from 2006-2018 indicates that the EKC hypothesis has to be rejected.
The investigation is structured as follows. First, there is an animated graph followed by an interactive world map which represents a descriptive analysis to visualize the underlying data. It is continued by a regression analysis and a brief machine learning chapter two show possible examination methods.

Animated Graph

Used Packages - gganimate, ggplot2, dplyr, ggthemes, tidyverse, countrycode, plotly

########### Loading & Checking Data #######
world_bank <- read.csv("world_bank.csv")



######## Data Preparation #####################

world_bank <- world_bank %>%
  rename(pop=Population..total..SP.POP.TOTL.,
         gdpPercap=GDP.per.capita..PPP..constant.2017.international.....NY.GDP.PCAP.PP.KD.,
         CO2_PerCap=CO2.emissions..metric.tons.per.capita...EN.ATM.CO2E.PC.,
         year=ï..Time)

####### Getting continent variable ######

world_bank$continent <- countrycode(sourcevar = world_bank[, "Country.Code"],
                            origin = "genc3c",
                            destination = "continent")



####### Adjusting the variables ########

world_bank$pop <- as.integer(world_bank$pop)
world_bank$pop_million <- world_bank$pop/1000000


world_bank$gdpPercap <- as.numeric(world_bank$gdpPercap)
world_bank$gdpPercap_1000 <- world_bank$gdpPercap/1000



world_bank$year <- as.integer(world_bank$year)

world_bank$CO2_PerCap <- as.numeric(world_bank$CO2_PerCap)

world_bank_complete <- na.omit(world_bank)


###### starting the plot ######
world_bank_graph <- world_bank_complete %>% 
  ggplot(aes(x=gdpPercap_1000, y=CO2_PerCap, color=continent, size=pop_million, label=Country.Name)) +
  geom_point(alpha=0.7) +
  geom_text(data = subset(world_bank_complete, 
                          Country.Code=="USA"|Country.Code=="CHN"),
            show.legend = FALSE)+
  theme_fivethirtyeight() +
  guides(color = guide_legend(override.aes = list(size = 4)))+
  labs(title = expression("CO2 Emission with GDP per Capita by Country"),
       x = "GDP per capita",
       y="CO2 emission per capita",
       caption = "Source: World Bank",
       color="Continent",
       size="Population in Mio")+
  theme(axis.title = element_text(),
        text = element_text(family = "Rubik"),
        legend.text = element_text(family ="Rubik",size = 11),
        legend.key.size = unit(.5, "cm")) +
  scale_color_brewer(palette = "Set2")
world_bank_graph

world_bank_graph.animation <- world_bank_graph +
  transition_time(year) +
  labs(subtitle =" Year: {frame_time}")
  

world_bank_graph.animation

Spatial Graph

####### PLOT ########

#### Add hover variable (used later) ####
world_bank_complete <- world_bank_complete %>%
    mutate(hover = paste(Country.Name, "\n",
                       lapply(world_bank_complete[,5], round, 2), "CO2 tons per capita", "\n",
                       lapply(world_bank_complete[,10], round,2), "GDP per capita in 1000$"))




world_co2_graph <- plot_geo(world_bank_complete,
                            frame =~year) %>%
  add_trace(locations = ~Country.Code,
            z=~CO2_PerCap,
            zmin=0,
            zmax=20,
            color=~CO2_PerCap,
            text=~hover,
            hoverinfo=~"text",
            colorscale="Inferno") %>%
  layout(geo=list(projection = list( type = "orthographic")),
         font = list(family="Rubik Black", size=11),
    title="CO2 emission in metric tons per capita 2006-2018")%>%
  colorbar(title="CO2 in t") %>%
  config(displayModeBar=FALSE)
            

world_co2_graph

Regression Analysis

Used Packages - h2o, ranger, plm, sjPlot, PanJen

# Creation of continent dummys
world_bank_complete$continent <- as.factor(world_bank_complete$continent)

# Creating panel data
panel_worldbank_data <- pdata.frame(world_bank_complete,c("Country.Code","year"))


# Find the best specification for the CO2 model
form<-formula(log(CO2_PerCap) ~ pop_million + continent, 
              data = panel_worldbank_data)

fxlist= list(
  linear = function(x) x,
  sqr = function(x) x^2,
  poly = function(x) x^3,
  log = function(x) log(x),
  inverse = function(x) 1/(x)
)

model<-choose.fform(data=panel_worldbank_data,variable="gdpPercap_1000",
                    base_form=form, functionList=fxlist)
##               AIC     BIC ranking (BIC)
## smoothing 3708.26 3800.24             1
## log       3890.91 3937.02             2
## inverse   5149.81 5195.91             3
## linear    6344.19 6390.29             4
## sqr       7038.23 7084.33             5
## poly      7248.77 7294.87             6
## base      7445.35 7485.69             7
## [1] "Smoothing is a semi-parametric and data-driven transformation, please see Wood (2006) for an elaboration"
### Random and Fixed effects estimation #####
random_effectes_reg <- plm(log(CO2_PerCap) ~ log(gdpPercap_1000) + pop_million, # + continent, 
                          data = panel_worldbank_data, model="random", index = c("Country.Code","year"))

fixed_effectes_reg <- plm(log(CO2_PerCap) ~ log(gdpPercap_1000) + pop_million, # + continent, 
                           data = panel_worldbank_data, model="within", index = c("Country.Code","year"))

##### Hausmanntest ###
phtest(fixed_effectes_reg,random_effectes_reg)
## 
##  Hausman Test
## 
## data:  log(CO2_PerCap) ~ log(gdpPercap_1000) + pop_million
## chisq = 108.1, df = 2, p-value < 2.2e-16
## alternative hypothesis: one model is inconsistent
# RE in inconsistent

#### Setting up correlated random effects model ####
#### Taking the mean of the time varying variables ####

#  Average per Country of population in Million #
aggre_table<-aggregate(panel_worldbank_data$pop_million, 
                       by=list(panel_worldbank_data$Country.Code), FUN=mean)
for(i in unique(panel_worldbank_data$Country.Code)){
  panel_worldbank_data$pop_million_mean[panel_worldbank_data$Country.Code==i]<-aggre_table[aggre_table[,1]==i,2]}


#  Average per Country of GDP per Capita #
aggre_table<-aggregate(panel_worldbank_data$gdpPercap_1000, 
                       by=list(panel_worldbank_data$Country.Code), FUN=mean)
for(i in unique(panel_worldbank_data$Country.Code)){
  panel_worldbank_data$gdpPercap_1000_mean[panel_worldbank_data$Country.Code==i]<-aggre_table[aggre_table[,1]==i,2]}



### Model estimation of Correlated Random effects ####
corr_random_effectes_reg <- plm(log(CO2_PerCap) ~ log(gdpPercap_1000) + pop_million + continent 
                                  + gdpPercap_1000_mean, 
                           data = panel_worldbank_data, model="random")

# Pop_milion_mean is taken out due to multicollinarity. gdpPercap_1000_mean is significant, hence
# CRE estimation is necessary as predicted by the Hausmann test

tab_model(corr_random_effectes_reg, file="output.html", 
          title="Correlated Random Effects", vcov.type = "HC3", robust=TRUE)
Correlated Random Effects
  log(CO2_PerCap)
Predictors Estimates CI p
(Intercept) -1.95 -2.23 – -1.67 <0.001
gdpPercap_1000 [log] 0.86 0.68 – 1.04 <0.001
pop_million 0.00 -0.00 – 0.00 0.130
continent [Americas] 0.49 0.19 – 0.78 0.001
continent [Asia] 0.67 0.32 – 1.03 <0.001
continent [Europe] 0.52 0.16 – 0.87 0.004
continent [Oceania] 0.88 0.53 – 1.23 <0.001
gdpPercap_1000_mean 0.01 0.00 – 0.02 0.046
Observations 2352
R2 / R2 adjusted 0.455 / 0.453

Machine Learning

Used Packages - randomForest, randomUniformForest, plotmo, gbm

######################################################
###### Defining Functions for Machine Learning #######
###### data partitioning ###################
TVHsplit<-function(df, split = c(0.5, 0.5),
                   labels = c("T", "V"), iseed = 1176){
  set.seed(iseed)
  flags <- sample(labels, size = nrow(df),
                  prob = split, replace = TRUE)
  return(flags)
}

#### Performance Test ######
ValidationRsquared<-function(validObs, validHat){
  resids <- validHat - validObs
  yBar <- mean(validObs)
  offset <- validObs - yBar
  num <- sum(resids^2)
  denom <- sum(offset^2)
  Rsq <- 1 - num/denom
  return(Rsq)
}
# End of defining functions

### generate a training and a validation dataset ####
WorldBankFlag <- TVHsplit(world_bank_complete, split = c(0.7, 0.3),
                         labels = c("T", "V"))
WorldBankTrain <- world_bank_complete[which(WorldBankFlag == "T"), ]
WorldBankValid <- world_bank_complete[which(WorldBankFlag == "V"), ]


# Machine learning estimation - Random Forest
rfCO2_model<-randomForest(CO2_PerCap ~ gdpPercap_1000 + year + pop_million + continent + Country.Name, 
                          data = WorldBankTrain, importance = TRUE)

# Performance Test
rfCO2_modelHatV <- predict(rfCO2_model, newdata = WorldBankValid)
ValidationRsquared(WorldBankValid$CO2_PerCap, rfCO2_modelHatV)
## [1] 0.8715406
# Visualization of Machine Learning Estimation
plot(rfCO2_model)

varImpPlot(rfCO2_model)

# compare predictions to actual results
prediction_results_forest <- data.frame(WorldBankValid$CO2_PerCap, rfCO2_modelHatV, 
                                 WorldBankValid$Country.Name, WorldBankValid$year) 
colnames(prediction_results_forest) <- c("Actual CO2 pp", "Forest prediction CO2 pp", 
                                      "Country", "Year")
head(prediction_results_forest, n=10L)
##    Actual CO2 pp Forest prediction CO2 pp                  Country Year
## 7      5.3194705                6.6282075      Antigua and Barbuda 2006
## 12     8.9615694                9.1234644                  Austria 2006
## 16     0.2547524                1.7030228               Bangladesh 2006
## 17     5.2977746                6.9624355                 Barbados 2006
## 32     0.0249742                0.6580490                  Burundi 2006
## 35     0.3813720                0.7026253                 Cameroon 2006
## 36    16.6119281               12.2180342                   Canada 2006
## 38     0.0631363                0.8834994 Central African Republic 2006
## 41     3.4883351                4.2863329                    Chile 2006
## 43     1.3360832                2.5163303                 Colombia 2006
##########################################################
# Machine learning estimation - Gradient Boosting Machines

gbm_co2_Model <- gbm(CO2_PerCap ~ gdpPercap_1000 + year + pop_million + continent,
                        data = WorldBankTrain,
                        distribution = "gaussian",
                        interaction.depth = 40,
                        shrinkage = 0.05,
                        n.trees = 2000,
                        n.minobsinnode = 10)


# Performance Test
gbm_co2_ModelHatV <- predict(gbm_co2_Model, newdata = WorldBankValid,
                            n.trees = 1000)
ValidationRsquared(WorldBankValid$CO2_PerCap, gbm_co2_ModelHatV)
## [1] 0.9405641
# Visualization of Machine Learning Estimation
plotmo(gbm_co2_Model)
##  plotmo grid:    gdpPercap_1000 year pop_million continent
##                        11.70814 2012    7.912398    Africa

# compare predictions to actual results
prediction_results_gbm <- data.frame(WorldBankValid$CO2_PerCap, gbm_co2_ModelHatV, 
                                        WorldBankValid$Country.Name, WorldBankValid$year) 
colnames(prediction_results_gbm) <- c("Actual CO2 pp", "GBM prediction CO2 pp", 
                                      "Country", "Year")
head(prediction_results_gbm, n=10L)
##    Actual CO2 pp GBM prediction CO2 pp                  Country Year
## 1      5.3194705            6.52236822      Antigua and Barbuda 2006
## 2      8.9615694            9.72331231                  Austria 2006
## 3      0.2547524            0.19296383               Bangladesh 2006
## 4      5.2977746            9.58067140                 Barbados 2006
## 5      0.0249742            0.06796703                  Burundi 2006
## 6      0.3813720            0.23996058                 Cameroon 2006
## 7     16.6119281           15.48849908                   Canada 2006
## 8      0.0631363            0.45428269 Central African Republic 2006
## 9      3.4883351            5.55888444                    Chile 2006
## 10     1.3360832            1.61794033                 Colombia 2006